#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/5/25 12:53
# @Author : CoderXYX
# @Site : 
# @File : tencent_danmu_spider.py
# @Software: PyCharm

import os
import time
import json
import random

import jieba
import requests
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Font file used to render the word cloud (must support CJK glyphs)
WC_FONT_PATH = '/Fonts/STXINWEI.TTF'

# File that scraped danmu (bullet comments) are appended to
DANMU_FILE_PATH = 'tencent_danmu.txt'

def spider_danmu(page=45, target_id='5219759977'):
    """
    Scrape one page of danmu (bullet comments) for a Tencent Video episode.

    The defaults reproduce the originally hard-coded request, so calling
    with no arguments behaves exactly as before; passing ``page`` and
    ``target_id`` lets callers page through arbitrary episodes.

    :param page: timestamp cursor the API uses for paging (steps of 30)
    :param target_id: episode target id
    :return: 1 if danmu were fetched, 0 on request failure or when the
             episode has no more danmu (count == 0)
    """
    url = ('https://mfm.video.qq.com/danmu?otype=json'
           '&callback=jQuery19108705083609680488_1590402246541'
           '&target_id={1}%26vid%3Db0033m9le2c'
           '&session_key=49397%2C266%2C1590421597'
           '&timestamp={0}&_=1590402246653').format(page, target_id)
    kv = {'user-agent': 'Mozilla/5.0',
          'Referer': 'https://v.qq.com/x/cover/mzc00200iseomew.html'}
    try:
        r = requests.get(url, headers=kv)
        r.raise_for_status()
    except requests.RequestException:
        # Original fell through here and crashed with NameError on `r`;
        # report failure to the caller instead.
        print('爬取失败')
        return 0
    # The response is JSONP: callback(<json>). Slice out the payload
    # between the first '(' and the last ')'.
    r_json_str = r.text[r.text.index('(') + 1:r.text.rindex(')')]
    # strict=False: the payload may contain raw control characters.
    r_json_obj = json.loads(r_json_str, strict=False)
    # count == 0 means this episode's danmu are exhausted.
    if not r_json_obj['count']:
        return 0
    # Open the output file once and append every comment line.
    with open(DANMU_FILE_PATH, 'a+', encoding='utf-8') as file:
        for r_json_danmu in r_json_obj['comments']:
            file.write(r_json_danmu['content'] + '\n')
            print(r_json_danmu['content'])
    return 1

def spider_danmu1(page, target_id):
    """
    Scrape one page of danmu for the episode identified by ``target_id``.

    :param page: timestamp cursor the API uses for paging (steps of 30)
    :param target_id: episode target id (from :func:`spider_target_id`)
    :return: 1 if danmu were fetched, 0 on request failure or when the
             episode has no more danmu (count == 0)
    """
    url = ('https://mfm.video.qq.com/danmu?otype=json'
           '&callback=jQuery19101386222438229079_1590393692281'
           '&target_id={1}%26vid%3Db0033m9le2c'
           '&session_key=48928%2C266%2C1590393693'
           '&timestamp={0}&_=1590393692297').format(page, target_id)
    kv = {'user-agent': 'Mozilla/5.0',
          'Referer': 'https://v.qq.com/x/cover/mzc00200iseomew.html'}
    try:
        r = requests.get(url, headers=kv)
        r.raise_for_status()
    except requests.RequestException:
        # Original fell through here and crashed with NameError on `r`;
        # report failure to the caller instead.
        print('爬取失败')
        return 0
    # The response is JSONP: callback(<json>). Slice out the payload
    # between the first '(' and the last ')'.
    r_json_str = r.text[r.text.index('(') + 1:r.text.rindex(')')]
    # strict=False: the payload may contain raw control characters.
    r_json_obj = json.loads(r_json_str, strict=False)
    # count == 0 means this episode's danmu are exhausted.
    if not r_json_obj['count']:
        return 0
    # Open the output file once and append every comment line.
    with open(DANMU_FILE_PATH, 'a+', encoding='utf-8') as file:
        for r_json_danmu in r_json_obj['comments']:
            file.write(r_json_danmu['content'] + '\n')
            print(r_json_danmu['content'])
    return 1

def batch_spider_danmu():
    """
    Scrape danmu for every episode: resolve all target_ids, then page
    through each episode until the API reports no more comments.
    """
    # Start from a clean file so repeated runs don't accumulate old data.
    if os.path.exists(DANMU_FILE_PATH):
        os.remove(DANMU_FILE_PATH)
    # target_id of every episode
    target_ids = spider_vid()
    for target_id in target_ids:
        print(target_id)
        # The API pages by timestamp in steps of 30 seconds.
        i = 0
        # BUG FIX: the original called spider_danmu(i, target_id), but
        # spider_danmu takes no arguments — the paged variant is
        # spider_danmu1(page, target_id).
        while spider_danmu1(i, target_id):
            # Integer page index (i is always a multiple of 30).
            j = i // 30
            print('第%d页' % j + '\n')
            print('-------------------------------------------------------------------------------------------------' + '\n')
            # Random delay to mimic a human viewer and avoid an IP ban.
            time.sleep(random.random() * 5)
            i += 30
    print('爬取完毕')

def spider_vid():
    """
    Fetch the vid of every episode from the union.video.qq.com listing
    endpoint, then resolve each vid to a danmu target_id.

    :return: list of target_id strings (empty list on request failure)
    """
    url = 'https://union.video.qq.com/fcgi-bin/data?otype=json&tid=682&appid=20001238&appkey=6c03bbe9658448a4&union_platform=1&idlist=c0033wynpk2,b0033m9le2c,h0033h6xlpw,c0033wynpk2,a0033tyg4yc,c0033xlx07n,v0033hw1po1,e0033xar3x3,u00335p6qi8,o00331u1232,c0033wx25ue,k0033uupeq2,c0033pv0q3s,c0033jm4pje,b0033rqnq97,s0034tmrwym,l00347z2o2e,e00349umv6j,g0034acb2th,u0033frujqa,c0033huis2l,m0971iyt97g,y0034dhjuc3,w0971filfsm,r0971v42elk,e09719d3ozn,r097157d3g0,f0971bsiqr1,x0034tau1ns,w0034xu4294&callback=jQuery191026365599714048704_1590399699039&_=1590399699042'
    kv = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'Referer': 'https://v.qq.com/x/cover/mzc00200iseomew/h0033h6xlpw.html',
        'Cookie': 'pgv_pvi=8444654592; RK=dXa1MYoSfJ; ptcz=6f0c9a0223c5ea17256a845a3ca8030c16f1f1edd7b45942f979c37d3b2962b0; tvfe_boss_uuid=f06bd864efa19389; pgv_pvid=2905599416; ts_uid=2981984512; video_guid=5ddd71bd07d9a279; video_platform=2; login_remember=qq; main_login=qq; vqq_vuserid=116205083; vqq_openid=D6C8FB0BEB1BFFEDBDBBFEF723E830A4; vqq_appid=101483052; tvfe_search_uid=942cd314-14ad-47bc-bb90-5b9a3ae3ce87; mobileUV=1_16fb842040c_4f5e4; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22170199860be229-08d2c59828644e-b383f66-1049088-170199860bf4c9%22%2C%22%24device_id%22%3A%22170199860be229-08d2c59828644e-b383f66-1049088-170199860bf4c9%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; o_cookie=752834620; pac_uid=1_752834620; qq_nick=Coder-X; qq_head=http://thirdqq.qlogo.cn/g?b=oidb&k=Wm0FFcb2ibBSCKb9NzWx13A&s=640&t=1564631677; _ga=GA1.2.317541229.1584591123; bucket_id=9231002; vqq_access_token=6A65D83ABF9355C88AF435227A11261F; uid=52624220; pgv_info=ssid=s9024185296; qv_als=ds1nOr7lHg9x0W1uA11590394864C//TIQ==; ptag=|x; ts_last=v.qq.com/x/cover/mzc00200iseomew/h0033h6xlpw.html; vqq_vusession=6lovtYCCav1dM9dS1LYTJg..; ad_play_index=26'}
    try:
        r = requests.get(url, headers=kv)
        r.raise_for_status()
    except requests.RequestException:
        # Original fell through here and crashed with NameError on `r`;
        # return an empty list so callers can proceed gracefully.
        print('爬取失败')
        return []
    # Strip the JSONP wrapper and parse the JSON payload.
    r_json_obj = jsonp_func_to_json_obj(r.text)
    # Resolve each episode's vid to its danmu target_id.
    return [spider_target_id(v_ids['id']) for v_ids in r_json_obj['results']]

def spider_target_id(v_id):
    """
    Resolve the danmu target_id for one episode vid via the danmu
    registration endpoint.

    :param v_id: episode vid string
    :return: target_id parsed out of the strDanMuKey field
    """
    req_headers = {'user-agent': 'Mozilla/5.0', 'Referer': 'https://v.qq.com/x/cover/mzc00200iseomew/h0033h6xlpw.html'}
    # Registration endpoint that returns the danmu key for a vid.
    base_url = 'https://access.video.qq.com/danmu_manage/regist?vappid=97767206&vsecret=c0bdcbae120669fff425d0ef853674614aa659c605a613a4&raw=1'
    # Request body — only the vid changes between calls.
    payload = {
        "wRegistType": 2,
        "vecIdList": [v_id],
        "wSpeSource": 0,
        "bIsGetUserCfg": 1,
        "mapExtData": {v_id: {"strCid": "wu1e7mrffzvibjy", "strLid": ""}},
    }
    response = requests.post(base_url, data=json.dumps(payload), headers=req_headers)
    parsed = json.loads(response.text)
    danmu_key = parsed['data']['stMap'][v_id]['strDanMuKey']
    # strDanMuKey embeds "...targetid=<id>&vid=..."; slice out the id
    # between those two markers.
    start = danmu_key.find('targetid') + 9
    end = danmu_key.find('vid') - 1
    return danmu_key[start:end]

def jsonp_func_to_json_obj(jsonp_func):
    """
    Extract the JSON payload from a JSONP response and parse it.

    :param jsonp_func: JSONP response text shaped like ``callback(json)``
    :return: the parsed JSON object
    """
    # The payload sits between the first '(' and the last ')'.
    payload = jsonp_func[jsonp_func.index('(') + 1:jsonp_func.rindex(')')]
    return json.loads(payload)

def cut_word():
    """
    Tokenize the scraped danmu text with jieba.

    :return: all danmu contents segmented into words, joined by spaces
             (the format WordCloud.generate expects)
    """
    # 'r' instead of the original 'r+': the file is only read here,
    # so there is no reason to open it writable.
    with open(DANMU_FILE_PATH, 'r', encoding='utf-8') as file:
        comment_txt = file.read()
    # Precise-mode segmentation (cut_all=False avoids overlapping cuts).
    wl = " ".join(jieba.cut(comment_txt, cut_all=False))
    print(wl)
    return wl


def create_word_cloud():
    """
    Build and display a word cloud from the segmented danmu text.
    """
    # Filler words to drop before rendering. WordCloud's `stopwords`
    # parameter expects a set; the set also dedupes the repeated entries
    # the original list contained.
    stop_words = {'哈哈', '哈哈哈', '哈哈哈哈', '啊啊啊', '什么', '为什么', '不是', '就是', '还是', '真是', '这是', '是不是',
                  '应该', '不能', '这个', '电视', '电视剧', '怎么',
                  '这么', '那么', '那个', '没有', '不知', '知道'}
    # Word cloud configuration: font, background, canvas size, word limits.
    wc = WordCloud(background_color="white", max_words=900, width=940, height=400, scale=10,
                   max_font_size=50, random_state=42, stopwords=stop_words, font_path=WC_FONT_PATH)
    # Render the cloud from the space-joined segmented text.
    wc.generate(cut_word())
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    # BUG FIX: the original called plt.figure() here, after imshow(),
    # which opened a second blank window on show(); the cloud is already
    # drawn on the current figure.
    plt.show()

if __name__ == '__main__':
    # Scrape one page of danmu, then render the word cloud from the
    # collected comments.
    spider_danmu()
    create_word_cloud()